{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Extract labels from the evaluation files\n",
    "\n",
    "Test for one file first"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "# first test with one file\n",
    "file_path = 'data/IEMOCAP_full_release/Session1/dialog/EmoEvaluation/Ses01F_impro01.txt'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "useful_regex = re.compile(r'\\[.+\\]\\n', re.IGNORECASE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(file_path) as f:\n",
    "    file_content = f.read()\n",
    "    \n",
    "info_lines = re.findall(useful_regex, file_content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['[6.2901 - 8.2357]', 'Ses01F_impro01_F000', 'neu', '[2.5000, 2.5000, 2.5000]']\n",
      "['[10.0100 - 11.3925]', 'Ses01F_impro01_F001', 'neu', '[2.5000, 2.5000, 2.5000]']\n",
      "['[14.8872 - 18.0175]', 'Ses01F_impro01_F002', 'neu', '[2.5000, 2.5000, 2.5000]']\n",
      "['[19.2900 - 20.7875]', 'Ses01F_impro01_F003', 'xxx', '[2.5000, 3.0000, 3.0000]']\n",
      "['[21.3257 - 24.7400]', 'Ses01F_impro01_F004', 'xxx', '[2.5000, 3.0000, 2.5000]']\n",
      "['[27.4600 - 31.4900]', 'Ses01F_impro01_F005', 'neu', '[2.5000, 3.5000, 2.0000]']\n",
      "['[38.9650 - 43.5900]', 'Ses01F_impro01_F006', 'fru', '[2.0000, 3.5000, 3.5000]']\n",
      "['[46.5800 - 52.1900]', 'Ses01F_impro01_F007', 'fru', '[2.5000, 3.5000, 3.5000]']\n",
      "['[56.1600 - 58.8225]', 'Ses01F_impro01_F008', 'fru', '[2.0000, 3.5000, 3.5000]']\n"
     ]
    }
   ],
   "source": [
    "for l in info_lines[1:10]:\n",
    "    print(l.strip().split('\\t'))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Compile all the information in a single file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import os\n",
    "\n",
    "\n",
    "info_line = re.compile(r'\\[.+\\]\\n', re.IGNORECASE)\n",
    "\n",
    "start_times, end_times, wav_file_names, emotions, vals, acts, doms = [], [], [], [], [], [], []\n",
    "\n",
    "for sess in range(1, 6):\n",
    "    emo_evaluation_dir = 'data/IEMOCAP_full_release/Session{}/dialog/EmoEvaluation/'.format(sess)\n",
    "    evaluation_files = [l for l in os.listdir(emo_evaluation_dir) if 'Ses' in l]\n",
    "    for file in evaluation_files:\n",
    "        with open(emo_evaluation_dir + file) as f:\n",
    "            content = f.read()\n",
    "        info_lines = re.findall(info_line, content)\n",
    "        for line in info_lines[1:]:  # the first line is a header\n",
    "            start_end_time, wav_file_name, emotion, val_act_dom = line.strip().split('\\t')\n",
    "            start_time, end_time = start_end_time[1:-1].split('-')\n",
    "            val, act, dom = val_act_dom[1:-1].split(',')\n",
    "            val, act, dom = float(val), float(act), float(dom)\n",
    "            start_time, end_time = float(start_time), float(end_time)\n",
    "            start_times.append(start_time)\n",
    "            end_times.append(end_time)\n",
    "            wav_file_names.append(wav_file_name)\n",
    "            emotions.append(emotion)\n",
    "            vals.append(val)\n",
    "            acts.append(act)\n",
    "            doms.append(dom)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>start_time</th>\n",
       "      <th>end_time</th>\n",
       "      <th>wav_file</th>\n",
       "      <th>emotion</th>\n",
       "      <th>val</th>\n",
       "      <th>act</th>\n",
       "      <th>dom</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>10034</th>\n",
       "      <td>358.10</td>\n",
       "      <td>365.26</td>\n",
       "      <td>Ses05F_impro05_M049</td>\n",
       "      <td>fru</td>\n",
       "      <td>2.5</td>\n",
       "      <td>3.5</td>\n",
       "      <td>4.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10035</th>\n",
       "      <td>365.30</td>\n",
       "      <td>370.53</td>\n",
       "      <td>Ses05F_impro05_M050</td>\n",
       "      <td>neu</td>\n",
       "      <td>2.5</td>\n",
       "      <td>3.5</td>\n",
       "      <td>4.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10036</th>\n",
       "      <td>371.63</td>\n",
       "      <td>374.16</td>\n",
       "      <td>Ses05F_impro05_M051</td>\n",
       "      <td>neu</td>\n",
       "      <td>3.0</td>\n",
       "      <td>2.5</td>\n",
       "      <td>2.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10037</th>\n",
       "      <td>375.10</td>\n",
       "      <td>385.14</td>\n",
       "      <td>Ses05F_impro05_M052</td>\n",
       "      <td>neu</td>\n",
       "      <td>3.5</td>\n",
       "      <td>3.0</td>\n",
       "      <td>3.5</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10038</th>\n",
       "      <td>386.39</td>\n",
       "      <td>388.27</td>\n",
       "      <td>Ses05F_impro05_M053</td>\n",
       "      <td>neu</td>\n",
       "      <td>4.0</td>\n",
       "      <td>2.5</td>\n",
       "      <td>3.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "       start_time  end_time             wav_file emotion  val  act  dom\n",
       "10034      358.10    365.26  Ses05F_impro05_M049     fru  2.5  3.5  4.5\n",
       "10035      365.30    370.53  Ses05F_impro05_M050     neu  2.5  3.5  4.0\n",
       "10036      371.63    374.16  Ses05F_impro05_M051     neu  3.0  2.5  2.5\n",
       "10037      375.10    385.14  Ses05F_impro05_M052     neu  3.5  3.0  3.5\n",
       "10038      386.39    388.27  Ses05F_impro05_M053     neu  4.0  2.5  3.0"
      ]
     },
     "execution_count": 68,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "\n",
    "df_iemocap = pd.DataFrame(columns=['start_time', 'end_time', 'wav_file', 'emotion', 'val', 'act', 'dom'])\n",
    "\n",
    "df_iemocap['start_time'] = start_times\n",
    "df_iemocap['end_time'] = end_times\n",
    "df_iemocap['wav_file'] = wav_file_names\n",
    "df_iemocap['emotion'] = emotions\n",
    "df_iemocap['val'] = vals\n",
    "df_iemocap['act'] = acts\n",
    "df_iemocap['dom'] = doms\n",
    "\n",
    "df_iemocap.tail()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_iemocap.to_csv('data/pre-processed/df_iemocap.csv', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
